import os
import yaml


def sentences_pair(src_file, tgt_file):
    """Read two parallel text files and return aligned, non-empty sentence pairs.

    Line *i* of ``src_file`` is paired with line *i* of ``tgt_file``. Each line
    is stripped of leading/trailing whitespace, and a pair is dropped when
    either side is empty after stripping. Pairing stops at the end of the
    shorter file; surplus lines in the longer file are ignored.

    Args:
        src_file: Path to the source-language file (read as UTF-8).
        tgt_file: Path to the target-language file (read as UTF-8).

    Returns:
        A list of ``(source_sentence, target_sentence)`` tuples in file order.
    """
    paired_sentences = []
    # Use distinct names for the handles so the path arguments are not shadowed.
    with open(src_file, 'r', encoding='utf-8') as src_handle, \
            open(tgt_file, 'r', encoding='utf-8') as tgt_handle:
        # Iterate both files in lockstep instead of loading them fully into memory.
        for src_line, tgt_line in zip(src_handle, tgt_handle):
            # Remove surrounding whitespace (including the trailing newline).
            src_line = src_line.strip()
            tgt_line = tgt_line.strip()
            # Skip the pair if either sentence is empty.
            if not src_line or not tgt_line:
                continue
            paired_sentences.append((src_line, tgt_line))

    return paired_sentences


def load_local_dataset(dataset_name: str = "Multi30k", config: dict = None) -> dict:
    """Load every split of a locally stored parallel dataset as sentence pairs.

    The configuration is read as
    ``config['datasets'][dataset_name]`` and must provide:

    * ``'path'``: directory containing the dataset files.
    * ``'splits'``: mapping from split name to a sequence whose element 0 is
      the source file name and element 1 is the target file name, both
      relative to ``'path'``.

    Args:
        dataset_name: Key of the dataset inside ``config['datasets']``.
        config: Already-parsed configuration mapping. When ``None``, the file
            ``config.yaml`` in the current working directory is loaded.

    Returns:
        A dict mapping each split name to the list of sentence pairs returned
        by ``sentences_pair`` for that split's two files.

    Raises:
        KeyError: If the dataset or one of the expected keys is missing.
        OSError: If ``config.yaml`` or a split file cannot be opened.
    """
    if config is None:
        # Decode explicitly as UTF-8 so loading does not depend on the
        # platform's locale encoding (matches how the data files are read).
        with open('config.yaml', 'r', encoding='utf-8') as file:
            config = yaml.safe_load(file)
    dataset_cfg = config['datasets'][dataset_name]
    path = dataset_cfg['path']
    splits = dataset_cfg['splits']
    return {
        split: sentences_pair(
            os.path.join(path, files[0]),
            os.path.join(path, files[1]),
        )
        for split, files in splits.items()
    }

